Sys.setlocale('LC_ALL', 'C')
[1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=vi_VN;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=vi_VN;LC_IDENTIFICATION=C"
library(fpp2)
library(readxl) # read excel
library(lmtest) # linear model test
library(DIMORA) # BASS model
setwd('/home/hieu/BDMA/TSA/')
plot(a10, ylab='million dollars', xlab='Year', main='Antidiabetic drugs')
tsdisplay(a10) ##a general view of the data
seasonplot(a10, ylab="million dollars", xlab="Year", main="Seasonal plot: Antidiabetic drugs", year.labels=T, year.labels.left=T, col=1:20, pch=19)
Facebook example
# read the data
facebook<- read_excel("facebook.xlsx")
str(facebook)
tibble [48 x 2] (S3: tbl_df/tbl/data.frame)
$ quarter: chr [1:48] "Q3 '08" "Q4 '08" "Q1 '09" "Q2 '09" ...
$ fb : num [1:48] 100 150 197 242 305 360 431 482 550 608 ...
# create a variable 'time'
tt<- 1:NROW(facebook)
# create the variable 'fb'
fb <- facebook$fb
# make a plot
plot(tt, fb, xlab="Time", ylab="Facebook users")
# acf of variable "fb"
acf(fb)
# fit a linear regression model
fit1 <- lm(fb~ tt)
summary(fit1)
Call:
lm(formula = fb ~ tt)
Residuals:
Min 1Q Median 3Q Max
-73.058 -20.982 2.298 33.902 71.229
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 54.5363 10.9917 4.962 1e-05 ***
tt 53.6507 0.3905 137.378 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 37.48 on 46 degrees of freedom
Multiple R-squared: 0.9976, Adjusted R-squared: 0.9975
F-statistic: 1.887e+04 on 1 and 46 DF, p-value: < 2.2e-16
anova(fit1)
Analysis of Variance Table
Response: fb
Df Sum Sq Mean Sq F value Pr(>F)
tt 1 26515826 26515826 18873 < 2.2e-16 ***
Residuals 46 64629 1405
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot of the model
plot(tt, fb, xlab="Time", ylab="Facebook users")
abline(fit1, col=3)
# check the residuals? are they autocorrelated? Test of DW
dwtest(fit1)
Durbin-Watson test
data: fit1
DW = 0.16378, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
# check the residuals
resfit1<- residuals(fit1)
plot(resfit1,xlab="Time", ylab="residuals" )
# let us do the same with a linear model for time series, so we transform the data into a 'ts' object
fb.ts <- ts(fb, frequency = 4)
ts.plot(fb.ts, type="o")
# we fit a linear model with the tslm function
fitts<- tslm(fb.ts~trend)
# obviously it gives the same results of the first model
summary(fitts)
Call:
tslm(formula = fb.ts ~ trend)
Residuals:
Min 1Q Median 3Q Max
-73.058 -20.982 2.298 33.902 71.229
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 54.5363 10.9917 4.962 1e-05 ***
trend 53.6507 0.3905 137.378 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 37.48 on 46 degrees of freedom
Multiple R-squared: 0.9976, Adjusted R-squared: 0.9975
F-statistic: 1.887e+04 on 1 and 46 DF, p-value: < 2.2e-16
dwtest(fitts)
Durbin-Watson test
data: fitts
DW = 0.16378, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
Linear regression for iMac
apple<- read_excel("apple.xlsx")
str(apple)
tibble [56 x 4] (S3: tbl_df/tbl/data.frame)
$ iPhone: num [1:56] 0.27 1.12 2.32 1.7 0.72 6.89 4.36 3.79 5.21 7.37 ...
$ iPad : num [1:56] 3.27 4.19 7.33 4.69 9.25 ...
$ iPod : num [1:56] 14.04 8.53 8.11 8.73 21.07 ...
$ iMac : num [1:56] 1.25 1.11 1.33 1.61 1.61 ...
imac <- apple$iMac
#data visualization
plot(imac,type="l", xlab="quarter", ylab="iMac sales")
#variable tt for a linear model
tt<- 1:NROW(apple)
# linear model
fit2 <- lm(imac~tt)
summary(fit2)
Call:
lm(formula = imac ~ tt)
Residuals:
Min 1Q Median 3Q Max
-1.70990 -0.57455 0.01084 0.41485 1.66010
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.914625 0.200379 9.555 3.34e-13 ***
tt 0.064931 0.006116 10.617 7.87e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7397 on 54 degrees of freedom
Multiple R-squared: 0.6761, Adjusted R-squared: 0.6701
F-statistic: 112.7 on 1 and 54 DF, p-value: 7.871e-15
plot(imac,type="l", xlab="quarter", ylab="iMac sales")
abline(fit2, col=3)
dwtest(fit2)
Durbin-Watson test
data: fit2
DW = 0.76206, p-value = 3.628e-08
alternative hypothesis: true autocorrelation is greater than 0
# check the residuals
res2<- residuals(fit2)
plot(res2, xlab="quarter", ylab="residuals", type="l")
acf(res2)
# data transformed as time series
mac.ts<-ts(imac, frequency=4)
# Model with trend and seasonality
fit3 <- tslm(mac.ts~ trend+season)
summary(fit3)
Call:
tslm(formula = mac.ts ~ trend + season)
Residuals:
Min 1Q Median 3Q Max
-1.60158 -0.42293 -0.00687 0.54973 1.42797
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.155255 0.236078 9.129 2.62e-12 ***
trend 0.064591 0.005613 11.507 8.68e-16 ***
season2 -0.640448 0.256052 -2.501 0.0156 *
season3 -0.460039 0.256237 -1.795 0.0785 .
season4 0.176727 0.256544 0.689 0.4940
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6773 on 51 degrees of freedom
Multiple R-squared: 0.7436, Adjusted R-squared: 0.7235
F-statistic: 36.97 on 4 and 51 DF, p-value: 1.695e-14
# check the residuals
res3 <- residuals(fit3)
plot(res3, ylab="residuals")
dwtest(fit3)
Durbin-Watson test
data: fit3
DW = 0.44182, p-value = 5.722e-13
alternative hypothesis: true autocorrelation is greater than 0
# plot of the model
plot(mac.ts, ylab="iMac sales", xlab="Time")
lines(fitted(fit3), col=2)
Data on Australian beer production
beer<- ausbeer
beer
Qtr1 Qtr2 Qtr3 Qtr4
1956 284 213 227 308
1957 262 228 236 320
1958 272 233 237 313
1959 261 227 250 314
1960 286 227 260 311
1961 295 233 257 339
1962 279 250 270 346
1963 294 255 278 363
1964 313 273 300 370
1965 331 288 306 386
1966 335 288 308 402
1967 353 316 325 405
1968 393 319 327 442
1969 383 332 361 446
1970 387 357 374 466
1971 410 370 379 487
1972 419 378 393 506
1973 458 387 427 565
1974 465 445 450 556
1975 500 452 435 554
1976 510 433 453 548
1977 486 453 457 566
1978 515 464 431 588
1979 503 443 448 555
1980 513 427 473 526
1981 548 440 469 575
1982 493 433 480 576
1983 475 405 435 535
1984 453 430 417 552
1985 464 417 423 554
1986 459 428 429 534
1987 481 416 440 538
1988 474 440 447 598
1989 467 439 446 567
1990 485 441 429 599
1991 464 424 436 574
1992 443 410 420 532
1993 433 421 410 512
1994 449 381 423 531
1995 426 408 416 520
1996 409 398 398 507
1997 432 398 406 526
1998 428 397 403 517
1999 435 383 424 521
2000 421 402 414 500
2001 451 380 416 492
2002 428 408 406 506
2003 435 380 421 490
2004 435 390 412 454
2005 416 403 408 482
2006 438 386 405 491
2007 427 383 394 473
2008 420 390 410 488
2009 415 398 419 488
2010 414 374
plot(beer)
Acf(beer)
#take a portion of data and fit a linear model with tslm
beer1<- window(ausbeer, start=1992, end=2006 -.1)
beer1
Qtr1 Qtr2 Qtr3 Qtr4
1992 443 410 420 532
1993 433 421 410 512
1994 449 381 423 531
1995 426 408 416 520
1996 409 398 398 507
1997 432 398 406 526
1998 428 397 403 517
1999 435 383 424 521
2000 421 402 414 500
2001 451 380 416 492
2002 428 408 406 506
2003 435 380 421 490
2004 435 390 412 454
2005 416 403 408 482
plot(beer1)
m1<- tslm(beer1~ trend+ season)
summary(m1)
Call:
tslm(formula = beer1 ~ trend + season)
Residuals:
Min 1Q Median 3Q Max
-44.024 -8.390 0.249 8.619 23.320
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 441.8141 4.5338 97.449 < 2e-16 ***
trend -0.3820 0.1078 -3.544 0.000854 ***
season2 -34.0466 4.9174 -6.924 7.18e-09 ***
season3 -18.0931 4.9209 -3.677 0.000568 ***
season4 76.0746 4.9268 15.441 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.01 on 51 degrees of freedom
Multiple R-squared: 0.921, Adjusted R-squared: 0.9149
F-statistic: 148.7 on 4 and 51 DF, p-value: < 2.2e-16
fit<- fitted(m1)
plot(beer1)
lines(fitted(m1), col=2)
fore <- forecast(m1)
plot(fore)
# forecasts from regression model for beer production, The dark shaded region shows 80% prediction intervals and the light shaded 95% prediction intervals (range of values the random variable could take with relatively high probability).
# analysis of residuals
res<- residuals(m1)
plot(res)
# the form of residuals seems to indicate the presence of negative autocorrelation
Acf(res)
Data on quarterly percentage change in US consumption, income, production, savings, unemployment
uschange<- uschange
str(uschange)
Time-Series [1:187, 1:5] from 1970 to 2016: 0.616 0.46 0.877 -0.274 1.897 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:5] "Consumption" "Income" "Production" "Savings" ...
plot(uschange)
autoplot(uschange)
pairs(uschange)
#different way of seeing the same series
cons<- uschange[,1]
inc<- uschange[,2]
prod<- uschange[,3]
sav<- uschange[,4]
unem<- uschange[,5]
# consider the series of consumption as dependent variable and study with the other explanatory variables in a multiple regression model
fit.cons<- tslm(cons~inc+prod+sav+unem)
summary(fit.cons)
Call:
tslm(formula = cons ~ inc + prod + sav + unem)
Residuals:
Min 1Q Median 3Q Max
-0.88296 -0.17638 -0.03679 0.15251 1.20553
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.26729 0.03721 7.184 1.68e-11 ***
inc 0.71449 0.04219 16.934 < 2e-16 ***
prod 0.04589 0.02588 1.773 0.0778 .
sav -0.04527 0.00278 -16.287 < 2e-16 ***
unem -0.20477 0.10550 -1.941 0.0538 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3286 on 182 degrees of freedom
Multiple R-squared: 0.754, Adjusted R-squared: 0.7486
F-statistic: 139.5 on 4 and 182 DF, p-value: < 2.2e-16
AIC(fit.cons)
[1] 121.385
plot(cons)
lines(fitted(fit.cons), col=2)
res<- residuals(fit.cons)
plot(res)
acf(res)
Acf(res) # note the difference
# we remove the 'production' variable
fit.cons1<- tslm(cons~inc+sav+unem)
summary(fit.cons1)
Call:
tslm(formula = cons ~ inc + sav + unem)
Residuals:
Min 1Q Median 3Q Max
-0.82491 -0.17737 -0.02716 0.14406 1.25913
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.281017 0.036607 7.677 9.47e-13 ***
inc 0.730497 0.041455 17.622 < 2e-16 ***
sav -0.045990 0.002766 -16.629 < 2e-16 ***
unem -0.341346 0.072526 -4.707 4.96e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3305 on 183 degrees of freedom
Multiple R-squared: 0.7497, Adjusted R-squared: 0.7456
F-statistic: 182.7 on 3 and 183 DF, p-value: < 2.2e-16
Exercise with scenario hypotheses
# Fit the model again (by using the data in a different way)
fit.consBest <- tslm(
Consumption ~ Income + Savings + Unemployment,
data = uschange)
h <- 4 #window for prediction
# we assume a constant increase of 1 and 0.5% for income and savings and no change for unemployment
newdata <- data.frame(
Income = c(1, 1, 1, 1),
Savings = c(0.5, 0.5, 0.5, 0.5),
Unemployment = c(0, 0, 0, 0))
# forecasts
fcast.up <- forecast(fit.consBest, newdata = newdata)
# we assume a constant decrease of 1 and 0.5% for income and savings and no change for unemployment
newdata <- data.frame(
Income = rep(-1, h),
Savings = rep(-0.5, h),
Unemployment = rep(0, h))
fcast.down <- forecast(fit.consBest, newdata = newdata)
# graphical comparison of these two scenarios
autoplot(uschange[, 1]) +
ylab("% change in US consumption") +
autolayer(fcast.up, PI = TRUE, series = "increase") +
autolayer(fcast.down, PI = TRUE, series = "decrease") +
guides(colour = guide_legend(title = "Scenario"))
# Music data (RIAA)
music<- read_excel("music.xlsx")
str(music)
tibble [49 x 3] (S3: tbl_df/tbl/data.frame)
$ year : num [1:49] 1973 1974 1975 1976 1977 ...
$ cd : num [1:49] NA NA NA NA NA NA NA NA NA 0 ...
$ cassette: num [1:49] 15 15.3 16.2 21.8 36.9 ...
# create the variable cassette
cassette<- music$cassette[1:36]
# some simple plots
plot(cassette, type="b")
plot(cumsum(cassette), type="b")
# a better plot of the yearly time series
plot(cassette, type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
# we estimate a simple Bass Model
bm_cass<-BM(cassette,display = T)
summary(bm_cass)
Call: ( Standard Bass Model )
BM(series = cassette, display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-86.332 -43.233 -11.661 -7.027 35.917 63.537
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 45.85668 on 33 degrees of freedom
Multiple R-squared: 0.999883 Residual sum of squares: 69393.57
# # prediction (out-of-sample)
# pred_bmcas<- predict(bm_cass, newx=c(1:50))
# pred.instcas<- make.instantaneous(pred_bmcas)
#
# # plot of fitted model
# plot(cassette, type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
# axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
# lines(pred.instcas, lwd=2, col=2)
# prediction (out-of-sample)
pred_bmcas<- predict(bm_cass, newx=c(1:50))
pred.instcas<- make.instantaneous(pred_bmcas)
# plot of fitted model
plot(cassette, type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
lines(pred.instcas, lwd=2, col=2)
# we estimate the model with 50% of the data
bm_cass50<-BM(cassette[1:18],display = T)
summary(bm_cass50)
Call: ( Standard Bass Model )
BM(series = cassette[1:18], display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-23.498 -12.829 -7.279 -2.003 2.400 42.110
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 20.35052 on 15 degrees of freedom
Multiple R-squared: 0.999839 Residual sum of squares: 6212.152
pred_bmcas50<- predict(bm_cass50, newx=c(1:50))
pred.instcas50<- make.instantaneous(pred_bmcas50)
plot(cassette, type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
lines(pred.instcas50, lwd=2, col=2)
# we estimate the model with 25% of the data
bm_cass75<-BM(cassette[1:9],display = T)
summary(bm_cass75)
Call: ( Standard Bass Model )
BM(series = cassette[1:9], display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.9811 -0.8602 1.2852 1.0976 3.6938 7.2401
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 4.974005 on 6 degrees of freedom
Multiple R-squared: 0.999527 Residual sum of squares: 148.4444
pred_bmcas75<- predict(bm_cass75, newx=c(1:50))
pred.instcas75<- make.instantaneous(pred_bmcas75)
# Comparison between models (instantaneous)
plot(cassette, type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
lines(pred.instcas75, lwd=2, col=2)
lines(pred.instcas50, lwd=2, col=3)
lines(pred.instcas, lwd=2, col=4)
# Comparison between models (cumulative)
plot(cumsum(cassette), type= "b",xlab="Year", ylab="Annual sales", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37), labels=music$year[c(1,10,19,28,37)])
lines(pred_bmcas75, lwd=2, col=2)
lines(pred_bmcas50, lwd=2, col=3)
lines(pred_bmcas, lwd=2, col=4)
###exercise: try the same with the CD time series
Twitter (revenues)
twitter<- read_excel("twitter.xlsx")
length(twitter$twitter)
[1] 46
plot(twitter$twitter, type= "b",xlab="Quarter", ylab="Quarterly revenues", pch=16, lty=3, xaxt="n", cex=0.6)
axis(1, at=c(1,10,19,28,37,46), labels=twitter$quarter[c(1,10,19,28,37,46)])
###BM
tw<- (twitter$twitter)
Acf(tw)
bm_tw<-BM(tw,display = T)
summary(bm_tw)
Call: ( Standard Bass Model )
BM(series = tw, display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-744.1 -515.0 -160.2 -114.2 308.9 655.6
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 479.1877 on 43 degrees of freedom
Multiple R-squared: 0.998446 Residual sum of squares: 9873697
pred_bmtw<- predict(bm_tw, newx=c(1:60))
pred.insttw<- make.instantaneous(pred_bmtw)
plot(cumsum(tw), type= "b",xlab="Quarter", ylab="Cumulative revenues", pch=16, lty=3, cex=0.6, xlim=c(1,60), ylim=c(0,40000))
lines(pred_bmtw, lwd=2, col=2)
plot(tw, type= "b",xlab="Quarter", ylab="Quarterly revenues", pch=16, lty=3, cex=0.6, xlim=c(1,60))
lines(pred.insttw, lwd=2, col=2)
# GBMr1
GBMr1tw<- GBM(tw,shock = "rett",nshock = 1,prelimestimates = c(4.463368e+04, 1.923560e-03, 9.142022e-02, 24,38,-0.1))
# GBMe1
GBMe1tw<- GBM(tw,shock = "exp",nshock = 1,prelimestimates = c(4.463368e+04, 1.923560e-03, 9.142022e-02, 12,-0.1,0.1))
summary(GBMe1tw)
Call: ( Generalized Bass model with 1 Exponential shock )
GBM(series = tw, shock = "exp", nshock = 1, prelimestimates = c(44633.68,
0.00192356, 0.09142022, 12, -0.1, 0.1))
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-299.96 -109.20 -30.38 -12.36 69.07 299.52
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 136.3981 on 40 degrees of freedom
Multiple R-squared: 0.999765 Residual sum of squares: 744177.4
pred_GBMe1tw<- predict(GBMe1tw, newx=c(1:60))
pred_GBMe1tw.inst<- make.instantaneous(pred_GBMe1tw)
plot(cumsum(tw), type= "b",xlab="Quarter", ylab="Cumulative revenues", pch=16, lty=3, cex=0.6, xlim=c(1,60), ylim=c(0,50000))
lines(pred_GBMe1tw, lwd=2, col=2)
plot(tw, type= "b",xlab="Quarter", ylab="Quarterly revenues", pch=16, lty=3, cex=0.6, xlim=c(1,60))
lines(pred_GBMe1tw.inst, lwd=2, col=2)